import requests
import re
from Throttle import Throttle

#爬取节气数据

# Download a page
def download(url, user_agent='wswp', num_retries=2, proxies=None, timeout=30):
	"""Fetch *url* and return the response body as text, or None on failure.

	Args:
		url: Address to request.
		user_agent: Value sent in the ``User-Agent`` request header.
		num_retries: Number of further attempts allowed when the server
			answers with a 5xx status.
		proxies: Optional proxy mapping handed straight to ``requests.get``.
		timeout: Seconds handed to ``requests.get`` as its ``timeout``
			argument, so a stalled server cannot block the crawl forever.

	Returns:
		The response text when the status code is below 400. None when the
		status is 4xx, when it is 5xx and the retries are used up, or when
		``requests`` raises a ``RequestException``.
	"""
	print('Downloading:',url)
	headers={'User-Agent': user_agent}
	try:
		resp=requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
	except requests.exceptions.RequestException as e:
		# RequestException has no ``reason`` attribute; print the exception itself.
		print('Download error:', e)
		return None
	if resp.status_code < 400:
		return resp.text
	print('Download error:',resp.text)
	if num_retries > 0 and 500 <= resp.status_code < 600:
		# Only server-side (5xx) errors are retried. Keyword arguments keep the
		# decremented counter from landing in ``user_agent`` and preserve the
		# caller's header, proxy and timeout settings on the retry.
		return download(
			url,
			user_agent=user_agent,
			num_retries=num_retries-1,
			proxies=proxies,
			timeout=timeout,
		)
	return None

# Save one record
def save2file(data,fo):
	"""Convert one scraped (title, time) pair into an SQL INSERT line and write it.

	Args:
		data: Two-item sequence of strings, e.g.
			('2017年立春时间', '2017年2月3日 23:34:01').
			data[0] is read as '<year>年<name>时间' and data[1] as
			'<year>年<month>月<day>日<rest>'.
		fo: Object with a ``write`` method; receives one INSERT statement
			terminated by a newline.

	Raises:
		ValueError: If '年' or '时间' is missing from data[0], or '年', '月'
			or '日' is missing from data[1] (raised by ``str.index``).
	"""
	print('save to file,data:',data)
	title=data[0]
	stamp=data[1]

	# Locate every marker first so malformed input fails before anything is written.
	title_year_end=title.index('年')
	stamp_year_end=stamp.index('年')
	name_end=title.index('时间')
	month_end=stamp.index('月')
	day_end=stamp.index('日')

	year=title[:title_year_end]
	jieqi=title[title_year_end+1:name_end]
	# '2017年2月3日 23:34:01' -> '2017-2-3 23:34:01'
	t='-'.join((
		stamp[:stamp_year_end],
		stamp[stamp_year_end+1:month_end],
		stamp[month_end+1:day_end],
	))+stamp[day_end+1:]
	print(year,jieqi,t)

	# The values come from scraped HTML, so double any single quote (standard
	# SQL escaping) to keep a stray "'" from breaking out of the string literal.
	# NOTE(review): dialects that also treat backslash as an escape character
	# would need extra handling — confirm against the target database.
	quoted=[field.replace("'","''") for field in (year,jieqi,t)]
	sql="insert into jieqi(year,jieqi,jieqitime) values('{}','{}','{}');\n".format(*quoted)
	print(sql)
	fo.write(sql)

if __name__=='__main__':
	# Crawl the yearly pages for 2020..2050 and append one INSERT line per
	# matched entry to jieqi_data.txt.
	throttle=Throttle(5)  # time interval, in seconds
	# Each match yields (apptitle text, appintro text); compiled once because
	# the pattern is the same for every page.
	entry_re=re.compile(r'<span class="apptitle">(.*?)</span><span class="appintro">(.*?)</span>')
	# ``with`` closes the file even if a page fails to parse; explicit UTF-8
	# because the records contain Chinese text and the platform default
	# encoding may not be able to represent it.
	with open('jieqi_data.txt','a',encoding='utf-8') as fo:
		for y in range(2020,2051):
			url="https://jieqi.911cha.com/"+str(y)+".html"
			throttle.wait(url)
			res=download(url)
			if res is None:
				# download() returns None on failure (and has already printed
				# the error); passing None to the regex would raise TypeError.
				continue
			for data in entry_re.findall(res):
				print(data)
				save2file(data,fo)
	